Elsőként az Iris adathalmazt választottam, programozási nyelvnek pedig az R-t, azon belül a H2O csomagot. Egy picit ‘overkill’ a feladathoz, de hasznosnak találtam kipróbálni, mivel a munkában osztott rendszeren (Hadoop) dolgozunk, és a H2O-val HDFS-ben tárolt nagyobb adathalmazokat is fel lehet dolgozni (összekötöttem a hasznost a hasznossal). Hátránya is van: kevés modell van implementálva. Például az SVM hiányzik, mivel nehezebben párhuzamosítható osztott rendszeren.

Az Iris adathalmaz kicsi, viszont szerintem egész jól lehet majd vele mérni az osztályozók pontosságát, mivel az egyik osztály lineárisan elválasztható a másik kettőtől, de az utóbbi kettő egymástól lineárisan nem választható szét (lásd az alábbi ábrát 3 változóval).

library(plotly)
# Interactive 3D scatter of three of the iris measurements, coloured by species.
plot_ly(
  iris,
  x = ~Petal.Length,
  y = ~Sepal.Length,
  z = ~Petal.Width,
  color = ~Species,
  type = "scatter3d",
  marker = list(opacity = 0.5)
)

Az adatok

Csupán 4 valós változó van (a virágok csésze- és sziromleveleinek hossza és szélessége), valamint a virág típusa. Előfeldolgozást sem igényel az adathalmaz. A típus átalakítható “one hot encoding” módszerrel, ha éppenséggel valamelyik mérést szeretnénk becsülni a többi paraméter alapján. Az adathalmaz az R környezet része, nem volt szükség letölteni.

library(h2o)
h2o.init(nthreads = 8) # connect to the H2O server, starting it first if it is not running
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /tmp/Rtmp02gYGL/h2o_thecodewriter_started_from_r.out
##     /tmp/Rtmp02gYGL/h2o_thecodewriter_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: .. Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 613 milliseconds 
##     H2O cluster version:        3.10.3.3 
##     H2O cluster version age:    5 days  
##     H2O cluster name:           H2O_started_from_R_thecodewriter_aih502 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   26.67 GB 
##     H2O cluster total cores:    32 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     R Version:                  R version 3.3.2 (2016-10-31)
irisdf <-as.h2o(iris) # load the data into the H2O 'server'
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
summary(irisdf)
##  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width     
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.1000  
##  1st Qu.:5.099   1st Qu.:2.799   1st Qu.:1.596   1st Qu.:0.2992  
##  Median :5.798   Median :2.998   Median :4.348   Median :1.3000  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.1993  
##  3rd Qu.:6.399   3rd Qu.:3.298   3rd Qu.:5.095   3rd Qu.:1.7992  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.5000  
##  Species       
##  setosa    :50 
##  versicolor:50 
##  virginica :50 
##                
##                
## 
# Split the raw frame into a training part (ratio 0.8) and a test part.
# A fixed seed keeps the split repeatable between runs.
isplit <- h2o.splitFrame(
  irisdf,
  ratios = 0.8,
  destination_frames = c("train", "test"),
  seed = 121
)
itrain <- isplit[[1]]
itest <- isplit[[2]]
summary(itrain)
##  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width     
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.1000  
##  1st Qu.:5.099   1st Qu.:2.799   1st Qu.:1.496   1st Qu.:0.2992  
##  Median :5.799   Median :2.998   Median :4.248   Median :1.3000  
##  Mean   :5.838   Mean   :3.078   Mean   :3.708   Mean   :1.1700  
##  3rd Qu.:6.398   3rd Qu.:3.324   3rd Qu.:5.095   3rd Qu.:1.7992  
##  Max.   :7.700   Max.   :4.400   Max.   :6.900   Max.   :2.5000  
##  Species       
##  setosa    :42 
##  versicolor:39 
##  virginica :39 
##                
##                
## 
summary(itest)
##  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width     
##  Min.   :4.500   Min.   :2.200   Min.   :1.300   Min.   :0.2000  
##  1st Qu.:5.398   1st Qu.:2.724   1st Qu.:2.196   1st Qu.:0.5479  
##  Median :5.897   Median :2.949   Median :4.498   Median :1.4489  
##  Mean   :5.863   Mean   :2.973   Mean   :3.957   Mean   :1.3167  
##  3rd Qu.:6.299   3rd Qu.:3.200   3rd Qu.:4.972   3rd Qu.:1.7985  
##  Max.   :7.900   Max.   :3.900   Max.   :6.400   Max.   :2.5000  
##  Species       
##  versicolor:11 
##  virginica :11 
##  setosa    : 8 
##                
##                
## 

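A split nem tökéletesen arányos, mivel nagy adathalmazokra van kitalálva, és azokon működik megfelelően (a h2o.splitFrame soronként, valószínűségi alapon oszt szét, ezért a 80/20 arány csak közelítőleg teljesül).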

Készítettem egy normalizált adathalmazt is az eredmények összehasonlítása érdekében.

library(clusterSim) #normalizalashoz hasznalt csomag
## Loading required package: cluster
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## 
## This is package 'modeest' written by P. PONCET.
## For a complete list of functions, use 'library(help = "modeest")' or 'help.start()'.
# Normalise a copy so the built-in iris data stays untouched.
iris_norm <- iris
# Columns 1-4 are the four measurements; each is rescaled independently with
# n12 = (x - mean) / sqrt(sum((x - mean)^2)).
iris_norm[1:4] <- lapply(iris_norm[1:4], data.Normalization, type = "n12")
# Upload the normalised copy to H2O.
irisnormdf <- as.h2o(iris_norm)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Split the normalised frame with the same ratio and seed as the raw split.
# The destination keys must differ from the raw split's "train"/"test":
# reusing those keys overwrites the raw frames inside the H2O cluster, so
# `itrain` and `itest` would silently start pointing at the normalised data
# and every model "trained on itrain" would in fact see normalised values.
isplit <- h2o.splitFrame(
  irisnormdf,
  ratios = 0.8,
  destination_frames = c("train_norm", "test_norm"),
  seed = 121
)
intrain <- isplit[[1]]
intest <- isplit[[2]]
summary(intrain)
## Warning in summary.H2OFrame(intrain): Approximated quantiles
## computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.
##  Sepal.Length         Sepal.Width         Petal.Length       
##  Min.   :-0.1526868   Min.   :-0.198731   Min.   :-0.127992  
##  1st Qu.:-0.0736392   1st Qu.:-0.048517   1st Qu.:-0.104992  
##  Median :-0.0043465   Median :-0.011077   Median : 0.022737  
##  Mean   :-0.0004947   Mean   : 0.003947   Mean   :-0.002305  
##  3rd Qu.: 0.0548551   3rd Qu.: 0.050046   3rd Qu.: 0.062028  
##  Max.   : 0.1836859   Max.   : 0.252361   Max.   : 0.145813  
##  Petal.Width         Species       
##  Min.   :-0.118153   setosa    :42 
##  1st Qu.:-0.096744   versicolor:39 
##  Median : 0.010819   virginica :39 
##  Mean   :-0.003153                 
##  3rd Qu.: 0.064472                 
##  Max.   : 0.139792
summary(intest)
## Warning in summary.H2OFrame(intest): Approximated quantiles
## computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.
##  Sepal.Length        Sepal.Width        Petal.Length      
##  Min.   :-0.132900   Min.   :-0.16114   Min.   :-0.11407  
##  1st Qu.:-0.044098   1st Qu.:-0.06257   1st Qu.:-0.07247  
##  Median : 0.005349   Median :-0.02039   Median : 0.03433  
##  Mean   : 0.001979   Mean   :-0.01579   Mean   : 0.00922  
##  3rd Qu.: 0.045041   3rd Qu.: 0.02674   3rd Qu.: 0.05634  
##  Max.   : 0.203473   Max.   : 0.15838   Max.   : 0.12261  
##  Petal.Width        Species       
##  Min.   :-0.10741   versicolor:11 
##  1st Qu.:-0.07002   virginica :11 
##  Median : 0.02682   setosa    : 8 
##  Mean   : 0.01261                 
##  3rd Qu.: 0.06440                 
##  Max.   : 0.13979
# Same 3D scatter as for the raw data, now drawn from the normalised copy.
plot_ly(
  iris_norm,
  x = ~Petal.Length,
  y = ~Sepal.Length,
  z = ~Petal.Width,
  color = ~Species,
  type = "scatter3d",
  marker = list(opacity = 0.5)
)
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

Az osztályozók

A 3 osztályozó, amit választottam: Naive Bayes, neurális háló és Random forest.

Bayes

# Fit Naive Bayes: columns 1-4 are the predictors, column 5 (Species) is the
# response.
bayes <- h2o.naiveBayes(
  x = 1:4,
  y = 5,
  training_frame = itrain
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |=================================================================| 100%
bayes@model$training_metrics
## H2OMultinomialMetrics: naivebayes
## ** Reported on training data. **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("train")`
## MSE: (Extract with `h2o.mse`) 0.0220811
## RMSE: (Extract with `h2o.rmse`) 0.1485971
## Logloss: (Extract with `h2o.logloss`) 0.07090578
## Mean Per-Class Error: 0.03418803
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: vertical: actual; across: predicted
##            setosa versicolor virginica  Error      Rate
## setosa         42          0         0 0.0000 =  0 / 42
## versicolor      0         37         2 0.0513 =  2 / 39
## virginica       0          2        37 0.0513 =  2 / 39
## Totals         42         39        39 0.0333 = 4 / 120
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.966667
## 2 2  1.000000
## 3 3  1.000000
bayes@model$apriori
## A Priori Response Probabilities: 
##     setosa versicolor virginica
## 1 0.350000   0.325000  0.325000
bayes@model$pcond
## [[1]]
## Sepal.Length: 
##   y_by_sepallength      mean  std_dev
## 1           setosa -0.081785 0.035632
## 2       versicolor  0.005860 0.053590
## 3        virginica  0.080694 0.056365
## 
## [[2]]
## Sepal.Width: 
##   y_by_sepalwidth      mean  std_dev
## 1          setosa  0.076489 0.065613
## 2      versicolor -0.058970 0.060644
## 3       virginica -0.011258 0.047331
## 
## [[3]]
## Petal.Length: 
##   y_by_petallength      mean  std_dev
## 1           setosa -0.106446 0.008503
## 2       versicolor  0.021821 0.023518
## 3        virginica  0.085721 0.025013
## 
## [[4]]
## Petal.Width: 
##   y_by_petalwidth      mean  std_dev
## 1          setosa -0.102544 0.011917
## 2      versicolor  0.011646 0.021122
## 3       virginica  0.089085 0.028855
# Score the Naive Bayes model on the held-out frame and print its confusion matrix.
perfbayes <- h2o.performance(bayes,itest)
h2o.confusionMatrix(perfbayes)
## Confusion Matrix: vertical: actual; across: predicted
##            setosa versicolor virginica  Error     Rate
## setosa          8          0         0 0.0000 =  0 / 8
## versicolor      0         10         1 0.0909 = 1 / 11
## virginica       0          2         9 0.1818 = 2 / 11
## Totals          8         12        10 0.1000 = 3 / 30

Neurális háló

# Feed-forward network with a single hidden layer of 10 units.
# Input standardisation is switched off; weights, biases and variable
# importances are kept so they can be inspected afterwards.
nn <- h2o.deeplearning(
  x = 1:4,
  y = 5,
  training_frame = itrain,
  hidden = c(10),
  epochs = 1000,
  diagnostics = TRUE,
  variable_importances = TRUE,
  export_weights_and_biases = TRUE,
  standardize = FALSE
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=================================================================| 100%
nn@model$training_metrics
## H2OMultinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("train")`
## MSE: (Extract with `h2o.mse`) 0.04500374
## RMSE: (Extract with `h2o.rmse`) 0.2121409
## Logloss: (Extract with `h2o.logloss`) 0.1460662
## Mean Per-Class Error: 0.05982906
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: vertical: actual; across: predicted
##            setosa versicolor virginica  Error      Rate
## setosa         42          0         0 0.0000 =  0 / 42
## versicolor      0         39         0 0.0000 =  0 / 39
## virginica       0          7        32 0.1795 =  7 / 39
## Totals         42         46        32 0.0583 = 7 / 120
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.941667
## 2 2  1.000000
## 3 3  1.000000
nn@model$model_summary
## Status of Neuron Layers: predicting Species, 3-class classification, multinomial distribution, CrossEntropy loss, 83 weights/biases, 4.0 KB, 120,000 training samples, mini-batch size 1
##   layer units      type dropout       l1       l2 mean_rate rate_rms
## 1     1     4     Input  0.00 %                                     
## 2     2    10 Rectifier  0.00 % 0.000000 0.000000  0.022304 0.010972
## 3     3     3   Softmax         0.000000 0.000000  0.328551 0.452897
##   momentum mean_weight weight_rms mean_bias bias_rms
## 1                                                   
## 2 0.000000    0.353523   2.347559  1.075514 0.880512
## 3 0.000000   -0.494950   3.088834 -0.023552 0.103008
h2o.varimp_plot(nn)

h2o.weights(nn,matrix_id = 1)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  -0.09561024 -0.25414366    1.7936552    2.317148
## 2  -1.19130766  1.48091447    0.5580021    1.819052
## 3   1.01845717  0.06272632   -0.8774408   -2.536774
## 4  -0.37841806  2.10815573   -3.3898327   -3.984785
## 5   1.79173267 -0.74941695    2.3940663    4.084359
## 6  -0.88762665  0.50654703    2.2416735    2.467150
## 
## [10 rows x 4 columns]
h2o.biases(nn,vector_id = 1)
##            C1
## 1 -0.04846028
## 2  0.29538057
## 3  1.58846495
## 4  1.86967185
## 5  0.49847456
## 6  0.19380591
## 
## [10 rows x 1 column]
# Score the neural network on the held-out frame and print its confusion matrix.
perfnn <- h2o.performance(nn,itest)
h2o.confusionMatrix(perfnn)
## Confusion Matrix: vertical: actual; across: predicted
##            setosa versicolor virginica  Error     Rate
## setosa          8          0         0 0.0000 =  0 / 8
## versicolor      0         11         0 0.0000 = 0 / 11
## virginica       0          3         8 0.2727 = 3 / 11
## Totals          8         14         8 0.1000 = 3 / 30

Random forest

rf <- h2o.randomForest(x=1:4,y=5,itrain, ntrees = 40) # random forest made of 40 trees
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
rf@model$training_metrics
## H2OMultinomialMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("train")`
## MSE: (Extract with `h2o.mse`) 0.01976119
## RMSE: (Extract with `h2o.rmse`) 0.1405745
## Logloss: (Extract with `h2o.logloss`) 0.06214647
## Mean Per-Class Error: 0.03418803
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: vertical: actual; across: predicted
##            setosa versicolor virginica  Error      Rate
## setosa         42          0         0 0.0000 =  0 / 42
## versicolor      0         37         2 0.0513 =  2 / 39
## virginica       0          2        37 0.0513 =  2 / 39
## Totals         42         39        39 0.0333 = 4 / 120
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.966667
## 2 2  1.000000
## 3 3  1.000000
rf@model$model_summary
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1              40                      120               18311         1
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         6    3.01667          2         12     4.48333
h2o.varimp_plot(rf)

# Score the random forest on the held-out frame and print its confusion matrix.
perfrf <- h2o.performance(rf,itest)
h2o.confusionMatrix(perfrf)
## Confusion Matrix: vertical: actual; across: predicted
##            setosa versicolor virginica  Error     Rate
## setosa          8          0         0 0.0000 =  0 / 8
## versicolor      0         10         1 0.0909 = 1 / 11
## virginica       0          2         9 0.1818 = 2 / 11
## Totals          8         12        10 0.1000 = 3 / 30
# Generate synthetic iris-like observations from per-class Gaussian parameters.
#
# For each of the three species, every measurement is drawn independently from
# a normal distribution whose mean and standard deviation are read from
# `params`.
#
# Arguments:
#   n      - number of observations to draw per species (single non-negative
#            number).
#   params - list with (at least) four tables, in the order Sepal.Length,
#            Sepal.Width, Petal.Length, Petal.Width. Row k of each table
#            describes species k (1 = setosa, 2 = versicolor, 3 = virginica)
#            and must expose the columns `mean` and `std_dev`.
#
# Returns:
#   A data.frame with 3 * n rows: four numeric measurement columns plus a
#   `Species` factor, ordered setosa, versicolor, virginica.
#
# The columns are assembled as typed vectors. Binding the numeric draws to the
# text label with cbind() would coerce the whole block to a character matrix,
# so the measurements would come back as text (or factors) and lose precision.
gendata <- function(n, params) {
  species <- c("setosa", "versicolor", "virginica")
  measures <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
  stopifnot(
    is.numeric(n), length(n) == 1L, n >= 0,
    length(params) >= length(measures)
  )

  # Draw one species: measurements are sampled in the order of `measures`, and
  # species are processed in order, so the random stream is consumed in a
  # fixed, predictable sequence.
  draw_species <- function(k) {
    draws <- lapply(seq_along(measures), function(j) {
      row <- params[[j]][k, ]
      rnorm(n = n, mean = row[["mean"]], sd = row[["std_dev"]])
    })
    names(draws) <- measures
    block <- as.data.frame(draws, stringsAsFactors = FALSE)
    block$Species <- factor(rep(species[k], times = n), levels = species)
    block
  }

  do.call(rbind, lapply(seq_along(species), draw_species))
}

# Draw 200 synthetic flowers per species from the conditional distributions
# stored in the fitted Naive Bayes model.
testd <- gendata(n = 200, bayes@model$pcond)
# Force the four measurement columns to double. Going through character keeps
# the printed values even when a column arrives as a factor.
testd[c("Sepal.Length", "Petal.Length", "Sepal.Width", "Petal.Width")] <- lapply(
  testd[c("Sepal.Length", "Petal.Length", "Sepal.Width", "Petal.Width")],
  function(column) as.double(as.character(column))
)
# 3D view of the synthetic sample, coloured by the species it was drawn for.
plot_ly(
  testd,
  x = ~Petal.Length,
  y = ~Sepal.Length,
  z = ~Petal.Width,
  color = ~Species,
  marker = list(opacity = 0.5)
)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
# Upload the synthetic sample to H2O so the fitted models can be scored on it.
td<-as.h2o(testd)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
h2o.performance(nn,td)
## H2OMultinomialMetrics: deeplearning
## 
## Test Set Metrics: 
## =====================
## 
## MSE: (Extract with `h2o.mse`) 0.04503741
## RMSE: (Extract with `h2o.rmse`) 0.2122202
## Logloss: (Extract with `h2o.logloss`) 0.1740993
## Mean Per-Class Error: 0.05833333
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>, <data>)`)
## =========================================================================
## Confusion Matrix: vertical: actual; across: predicted
##            setosa versicolor virginica  Error       Rate
## setosa        200          0         0 0.0000 =  0 / 200
## versicolor      0        200         0 0.0000 =  0 / 200
## virginica       0         35       165 0.1750 = 35 / 200
## Totals        200        235       165 0.0583 = 35 / 600
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>, <data>)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.941667
## 2 2  1.000000
## 3 3  1.000000
h2o.performance(bayes,td)
## H2OMultinomialMetrics: naivebayes
## 
## Test Set Metrics: 
## =====================
## 
## MSE: (Extract with `h2o.mse`) 0.006274718
## RMSE: (Extract with `h2o.rmse`) 0.07921311
## Logloss: (Extract with `h2o.logloss`) 0.02144053
## Mean Per-Class Error: 0.01
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>, <data>)`)
## =========================================================================
## Confusion Matrix: vertical: actual; across: predicted
##            setosa versicolor virginica  Error      Rate
## setosa        200          0         0 0.0000 = 0 / 200
## versicolor      0        196         4 0.0200 = 4 / 200
## virginica       0          2       198 0.0100 = 2 / 200
## Totals        200        198       202 0.0100 = 6 / 600
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>, <data>)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.990000
## 2 2  1.000000
## 3 3  1.000000
h2o.performance(rf,td)
## H2OMultinomialMetrics: drf
## 
## Test Set Metrics: 
## =====================
## 
## MSE: (Extract with `h2o.mse`) 0.02130447
## RMSE: (Extract with `h2o.rmse`) 0.1459605
## Logloss: (Extract with `h2o.logloss`) 0.08037285
## Mean Per-Class Error: 0.02333333
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>, <data>)`)
## =========================================================================
## Confusion Matrix: vertical: actual; across: predicted
##            setosa versicolor virginica  Error       Rate
## setosa        200          0         0 0.0000 =  0 / 200
## versicolor      1        192         7 0.0400 =  8 / 200
## virginica       0          6       194 0.0300 =  6 / 200
## Totals        201        198       201 0.0233 = 14 / 600
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>, <data>)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.976667
## 2 2  1.000000
## 3 3  1.000000
# Append the network's prediction columns (including `predict`) to the synthetic sample.
testd<-cbind(testd,as.data.frame(h2o.predict(nn,td)))
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Flag every synthetic point: "+" when the predicted label equals the species
# it was generated for, "-" otherwise.
testd$val <- ifelse(testd$Species == testd$predict, "+", "-")
# 3D view coloured by that flag; hovering shows the true and predicted labels.
plot_ly(
  testd,
  x = ~Petal.Length,
  y = ~Sepal.Length,
  z = ~Petal.Width,
  color = ~val,
  colors = c("red", "green"),
  marker = list(opacity = 0.2),
  text = ~paste(Species, " ", predict)
)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
# Retrain the network on the full frame using only the three measurements that
# span the 3D plots and the prediction grid: Sepal.Length, Petal.Length and
# Petal.Width. Selecting columns 2:4 instead would pull in Sepal.Width, which
# the grid does not contain, and drop Sepal.Length, which it does, so the grid
# predictions would ignore one plotted axis entirely.
nn <- h2o.deeplearning(
  x = c("Sepal.Length", "Petal.Length", "Petal.Width"),
  y = "Species",
  training_frame = irisdf,
  hidden = c(10),
  epochs = 1000,
  diagnostics = TRUE,
  variable_importances = TRUE,
  export_weights_and_biases = TRUE
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
nn@model$training_metrics
## H2OMultinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("iris")`
## MSE: (Extract with `h2o.mse`) 0.01465887
## RMSE: (Extract with `h2o.rmse`) 0.1210738
## Logloss: (Extract with `h2o.logloss`) 0.04474672
## Mean Per-Class Error: 0.02666667
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: vertical: actual; across: predicted
##            setosa versicolor virginica  Error      Rate
## setosa         50          0         0 0.0000 =  0 / 50
## versicolor      0         47         3 0.0600 =  3 / 50
## virginica       0          1        49 0.0200 =  1 / 50
## Totals         50         48        52 0.0267 = 4 / 150
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.973333
## 2 2  1.000000
## 3 3  1.000000
#nn@model$model_summary
h2o.varimp_plot(nn)

nn@model$variable_importances
## Variable Importances: 
##       variable relative_importance scaled_importance percentage
## 1  Petal.Width            1.000000          1.000000   0.387752
## 2 Petal.Length            0.971460          0.971460   0.376685
## 3  Sepal.Width            0.607509          0.607509   0.235563
# Larger forest (200 trees) on the same four predictors, for comparison with
# the smaller one.
# NOTE(review): this model is later scored on a grid that has no Sepal.Width
# column -- confirm whether the forest should be limited to the grid's three
# measurements before trusting that plot.
rf <- h2o.randomForest(
  x = 1:4,
  y = 5,
  training_frame = itrain,
  ntrees = 200
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |=================================================================| 100%
rf@model$training_metrics
## H2OMultinomialMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("train")`
## MSE: (Extract with `h2o.mse`) 0.02046987
## RMSE: (Extract with `h2o.rmse`) 0.143073
## Logloss: (Extract with `h2o.logloss`) 0.06656551
## Mean Per-Class Error: 0.03418803
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: vertical: actual; across: predicted
##            setosa versicolor virginica  Error      Rate
## setosa         42          0         0 0.0000 =  0 / 42
## versicolor      0         37         2 0.0513 =  2 / 39
## virginica       0          2        37 0.0513 =  2 / 39
## Totals         42         39        39 0.0333 = 4 / 120
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.966667
## 2 2  1.000000
## 3 3  1.000000
rf@model$model_summary
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1             200                      600               92160         1
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         9    3.14667          2         13     4.50333
h2o.varimp_plot(rf)

# Regular lattice with 0.1 steps over three measurements, used to visualise
# how a model labels the whole space rather than just the observed flowers.
x <- seq(from = 0, to = 8, by = 0.1)
y <- seq(from = 0, to = 7, by = 0.1)
z <- seq(from = 0, to = 3, by = 0.1)
# Every combination of the three axes becomes one row.
gr <- expand.grid(
  Sepal.Length = x,
  Petal.Length = y,
  Petal.Width = z
)
grid <- as.h2o(gr, destination_frame = "grid")
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Random-forest predictions for every grid point, pulled back into R.
prrf<-as.data.frame(h2o.predict(rf,grid))
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Neural-network predictions for every grid point, pulled back into R.
prnn<-as.data.frame(h2o.predict(nn,grid))
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Colour each grid point by the class the random forest assigns to it.
plot_ly(
  gr,
  x = ~Petal.Length,
  y = ~Sepal.Length,
  z = ~Petal.Width,
  color = prrf$predict,
  marker = list(opacity = 0.2)
)